In [2]:
%matplotlib inline

In [3]:
import requests
from bs4 import BeautifulSoup
import pprint

def get_content(url):
    """ Grab html content from url """
    response = requests.get(url)
    html = response.content
    return html

def find_speakers(html):
    """ Find the speakers and talk info in the html """
    soup = BeautifulSoup(html)
    speakers = soup.body.article.ul.findAll('li')
    return speakers
  • AJ Vicens

    Mother Jones

  • 
    
    In [4]:
    def parse_speakers(speakers):
        """ Parse speakers that were found in HTML """
        
        data = {}
        for row in speakers:
            # Make sure we don't have any data left from previous loop
            speaker = org = talks = None
    
            # Grab speaker and org
            speaker = row.find("h3").string
            if speaker:
                org = row.find('p').string
    
            # Grab talk titles and talk links
            talks = {}
            talk_data = row.findAll('a', href=True)
            for talk in talk_data:
                link = 'http://ire.org' + talk['href']
                title = talk.string
                talks[title] = link
    
            # Add everything to a dictionary
            if speaker and talks:
                data[speaker] = {
                        'org' : org,
                        'talks' : talks, 
                    }
        return data
    
    
    
    In [47]:
    # Sample output of 5 records
    #import random
    
    # select random keys from the dictionary
    #random_keys = random.sample(data.keys(), 3)
    
    # loop over keys and select
    #for key in random_keys:
    #   print('\n', key)
    #   pprint.pprint(data[key])
    
    
    
    In [48]:
    # Process borrowed from here: https://github.com/scrapinghub/pycon-speakers/blob/432499e350098c69d4b3e0f641c960d927ec596d/pycon_speakers/pipelines.py
    
    import sexmachine.detector as gender
    
    def get_gender(name):
        firstname = name.split()[0]
        d = gender.Detector()
        name_gender = d.get_gender(firstname)
        return name_gender
    
    def count_genders(names):
        gender_count = {}
        names = data.keys()
        for name in names:
            name_gender = get_gender(name)
            try:
                gender_count[name_gender] += 1
            except KeyError:
                gender_count[name_gender] = 1
    
        print gender_count
        return gender_count
    
    
    
    In [49]:
    urls = (
        (2015, 'http://ire.org/conferences/nicar2015/speakers/'), 
        (2014, 'http://ire.org/conferences/nicar-2014/speakers/'),
        (2013, 'http://ire.org/conferences/nicar-2013/speakers/'),
        )
    
    
    
    In [50]:
    counts = {}
    for url in urls:
        html = get_content(url[1])
        speakers = find_speakers(html)
        data = parse_speakers(speakers)
        count = count_genders(data)
        
        counts[url[0]] = count
        
    pprint.pprint(counts)
    
    
    
    
    {u'mostly_male': 12, u'male': 143, u'andy': 24, u'female': 69, u'mostly_female': 4}
    {u'mostly_male': 13, u'male': 137, u'andy': 14, u'mostly_female': 3, u'female': 61}
    {u'mostly_male': 4, u'male': 98, u'andy': 10, u'mostly_female': 2, u'female': 35}
    {2013: {u'andy': 10,
            u'female': 35,
            u'male': 98,
            u'mostly_female': 2,
            u'mostly_male': 4},
     2014: {u'andy': 14,
            u'female': 61,
            u'male': 137,
            u'mostly_female': 3,
            u'mostly_male': 13},
     2015: {u'andy': 24,
            u'female': 69,
            u'male': 143,
            u'mostly_female': 4,
            u'mostly_male': 12}}
    
    
    
    In [5]:
    out = {2013: {u'andy': 10,
            u'female': 35,
            u'male': 98,
            u'mostly_female': 2,
            u'mostly_male': 4},
     2014: {u'andy': 14,
            u'female': 61,
            u'male': 137,
            u'mostly_female': 3,
            u'mostly_male': 13},
     2015: {u'andy': 24,
            u'female': 69,
            u'male': 143,
            u'mostly_female': 4,
            u'mostly_male': 12}}
    
    
    
    In [6]:
    timedata = []
    for k,v in out.iteritems():
        male = v['mostly_male'] + v['male']
        female = v['mostly_female'] + v['female']
        total = male + female
        
        timedata.append((k, female*1.0/total, male*1.0/total))
    print timedata
    
    
    
    
    [(2013, 0.26618705035971224, 0.7338129496402878), (2014, 0.29906542056074764, 0.7009345794392523), (2015, 0.3201754385964912, 0.6798245614035088)]
    
    
    
    In [7]:
    import pandas as pd
    
    # Turn the years into the index. 
    # There is probably a better way to do this. 
    years = [i[0] for i in timedata]
    values = [i[1:3] for i in timedata]
    df = pd.DataFrame(values, index=years)
    print df
    
    
    
    
                 0         1
    2013  0.266187  0.733813
    2014  0.299065  0.700935
    2015  0.320175  0.679825
    
    
    
    In [8]:
    import matplotlib.pyplot as plt
    
    plt.figure()
    df.plot()
    plt.legend(loc='best')
    plt.show()
    
    
    
    
    <matplotlib.figure.Figure at 0x106b24250>